import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from geopy.geocoders import Nominatim
from IPython.display import Image
from datetime import datetime, timedelta, time
import keras
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, LSTM, TimeDistributed, Conv2D, MaxPooling2D, Flatten, Dropout
# from keras.utils import plot_model
# from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization
from sklearn import metrics
# Load the Citi Bike station feed; each row wraps a nested record in the
# 'stationBeanList' column.
stations = pd.read_json('https://feeds.citibikenyc.com/stations/stations.json')
stations.head()
# Flatten the nested station records into columns and append them.
flattened = pd.json_normalize(stations['stationBeanList'])
stations = pd.concat([stations, flattened], axis=1)
stations.head(2)
stations.drop('stationBeanList', axis=1, inplace=True)
# Parse the two timestamp columns; unparseable values become NaT.
datetime_cols = ['executionTime', 'lastCommunicationTime']
stations[datetime_cols] = stations[datetime_cols].apply(pd.to_datetime, errors='coerce')
# Per-column overview: non-null count, dtype, distinct values, missing values.
stations_summary = pd.concat(
    [stations.count(), stations.dtypes, stations.nunique(), stations.isnull().sum()],
    axis=1, sort=False)
stations_summary.columns = ['count', 'dtype', 'nunique', 'NaN']
stations_summary
We have data for 509 Citi Bike NYC stations, captured on 2016-01-22 16:32:49.
# Station status distribution (e.g. "In Service" vs others).
stations['statusValue'].value_counts()
# Summary statistics for the numeric columns (docks, coordinates, ...).
stations.describe()
The mean number of total docks is 32, with a standard deviation of 11. The maximum for a single station is 67 docks, and 75% of stations have at most 39 docks.
On 2016-01-22 16:32:49, we also observe the following figures:
The mean number of available docks was 21, with a standard deviation of 12. The maximum for a single station was 62 docks, and 75% of stations had at most 28 available docks.
The mean number of available bikes was 10, with a standard deviation of 10. The maximum for a single station was 46 bikes, and 75% of stations had at most 16 available bikes.
# Coordinate ranges and a quick scatter of all station locations.
print(f'longitude min {stations.longitude.min()} max {stations.longitude.max()}')
print(f'latitude min {stations.latitude.min()} max {stations.latitude.max()}')
plt.plot(stations.longitude, stations.latitude, 'b.')
# Using the Geopy library
from geopy.geocoders import Nominatim
# Nominatim requires a user_agent string identifying the application.
geolocator = Nominatim(user_agent='bike_stations')
def rever_geo(x):
    """Reverse-geocode a row's latitude/longitude via Nominatim.

    Returns the fifth comma-separated component of the address string
    (typically the county/borough for NYC addresses), or None when the
    lookup fails or the address has fewer components.
    """
    try:
        location = geolocator.reverse((x['latitude'], x['longitude']))
        parts = str(location).split(', ')
        # Guard against addresses with fewer than five components,
        # which would previously raise and be silently swallowed.
        return parts[4] if len(parts) > 4 else None
    except Exception:
        # Geocoder/network errors happen occasionally; the caller fills
        # missing boroughs afterwards, so return None explicitly rather
        # than falling off the end of the function.
        return None
# Geopy speed is to improve/replace
stations['borough'] = stations.apply(rever_geo, axis=1)
stations['borough'].value_counts()
stations['borough'].fillna('New York', inplace=True)
# Map raw reverse-geocoded labels onto the five NYC boroughs. Order matters:
# patterns are applied top to bottom, exactly like the original chain of
# .loc assignments. Jersey City and Hudson County are in New Jersey, which
# is not an NYC borough; we classify them under Staten Island for
# longitude proximity.
borough_patterns = [
    ('Manhattan', 'Manhattan'),
    ('Times Square', 'Manhattan'),
    ('10075-0381', 'Manhattan'),
    ('New York County', 'Manhattan'),
    ('Kings County', 'Brooklyn'),
    ('11211', 'Brooklyn'),
    ('Queens', 'Queens'),
    ('New Jersey', 'Staten Island'),
    ('Jersey City', 'Staten Island'),
    ('Hudson County', 'Staten Island'),
]
for pattern, borough in borough_patterns:
    stations.loc[stations['borough'].str.contains(pattern), 'borough'] = borough
stations['borough'].value_counts()
# Interactive map of stations: marker size = dock capacity, color = borough.
# Passing column names (instead of Series) lets plotly resolve them from
# the dataframe — the rendered figure is identical.
fig = px.scatter(stations, x='longitude', y='latitude', size='totalDocks',
                 hover_name='stationName', color='borough',
                 title='Stations coordinates')
fig.show()
Manhattan, Brooklyn, Queens and Staten Island boroughs are clearly showing. The "New York"-labelled data is spread over the map and can be further cleaned. The Bronx label is missing; it should be included in parts of the Manhattan data. This can also be further adjusted (for example, by selecting parts of the new Manhattan data above some latitude threshold).
The Motor Vehicle Collisions data tables contain information from all police reported motor vehicle collisions in NYC. The police report (MV104-AN) is required to be filled out for collisions where someone is injured or killed, or where there is at least $1000 worth of damage (https://www.nhtsa.gov/sites/nhtsa.dot.gov/files/documents/ny_overlay_mv-104an_rev05_2004.pdf). It should be noted that the data is preliminary and subject to change when the MV-104AN forms are amended based on revised crash details.For the most accurate, up to date statistics on traffic fatalities, please refer to the NYPD Motor Vehicle Collisions page (updated weekly) or Vision Zero View (updated monthly).
TrafficStat, which uses the CompStat model to work towards improving traffic safety. Police officers complete form MV-104AN for all vehicle collisions. The MV-104AN is a New York State form that has all of the details of a traffic collision. Before implementing Trafficstat, there was no uniform traffic safety data collection procedure for all of the NYPD precincts. Therefore, the Police Department implemented the Traffic Accident Management System (TAMS) in July 1999 in order to collect traffic data in a uniform method across the City. TAMS required the precincts manually enter a few selected MV-104AN fields to collect very basic intersection traffic crash statistics which included the number of accidents, injuries and fatalities. As the years progressed, there grew a need for additional traffic data so that more detailed analyses could be conducted. The Citywide traffic safety initiative, Vision Zero started in the year 2014. Vision Zero further emphasized the need for the collection of more traffic data in order to work towards the Vision Zero goal, which is to eliminate traffic fatalities. Therefore, the Department in March 2016 replaced the TAMS with the new Finest Online Records Management System (FORMS). FORMS enables the police officers to electronically, using a Department cellphone or computer, enter all of the MV-104AN data fields and stores all of the MV-104AN data fields in the Department’s crime data warehouse. Since all of the MV-104AN data fields are now stored for each traffic collision, detailed traffic safety analyses can be conducted as applicable.
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95
Date Made Public: 5/7/2014
Agency: Police Department (NYPD)
Data Collection: Data Collection Motor Vehicle Collisions
Attachments: MVCollisionsDataDictionary_20190813_ERD.xlsx
# from google.colab import drive
# drive.mount('/content/drive')
# collisions = pd.read_csv('/content/drive/My Drive/Motor_Vehicle_Collisions_-_Crashes.csv',parse_dates=[['CRASH DATE', 'CRASH TIME']])
# Load the NYPD collisions CSV; the date and time columns are parsed and
# merged into a single 'CRASH DATE_CRASH TIME' datetime column.
collisions = pd.read_csv('Motor_Vehicle_Collisions_-_Crashes.csv',parse_dates=[['CRASH DATE', 'CRASH TIME']])
collisions.head(2)
collisions.info(verbose=False)
# Per-column overview: non-null count, dtype, distinct values, missing values.
collisions_summary = pd.concat([collisions.count(),collisions.dtypes, collisions.nunique(),collisions.isnull().sum()], axis=1, sort=False)
collisions_summary.columns=['count','dtype','nunique','NaN']
collisions_summary
collisions.describe()
# Raw coordinates: placeholder (0, 0) points and outliers distort the map.
plt.plot(collisions.LONGITUDE, collisions.LATITUDE, 'b.')
# Treat (0, 0) placeholder coordinates as missing.
collisions.loc[collisions['LONGITUDE'] == 0, ["LONGITUDE", "LATITUDE"]] = np.nan
plt.plot(collisions.LONGITUDE, collisions.LATITUDE, 'b.')
# Progressively drop points far outside the NYC bounding box,
# re-plotting after each step to inspect the effect of each filter.
collisions.drop(collisions[collisions.LONGITUDE <-125].index, inplace=True)
collisions.drop(collisions[collisions.LATITUDE <36].index, inplace=True)
plt.plot(collisions.LONGITUDE, collisions.LATITUDE, 'b.')
collisions.drop(collisions[collisions.LONGITUDE >-50].index, inplace=True)
plt.plot(collisions.LONGITUDE, collisions.LATITUDE, 'b.')
collisions.drop(collisions[collisions.LATITUDE >41].index, inplace=True)
plt.plot(collisions.LONGITUDE, collisions.LATITUDE, 'b.')
collisions.drop(collisions[collisions.LONGITUDE < -74.4].index, inplace=True)
plt.plot(collisions.LONGITUDE, collisions.LATITUDE, 'b.')
collisions.describe()
• What is the most dangerous NYC borough / area for a bicycle rider? Use visualization.
from IPython.display import Image
# Reference map of the five NYC boroughs.
Image(filename='300px-5_Boroughs_Labels_New_York_City_Map.svg.png')
# Collision counts per borough (rows with missing borough are excluded).
collisions['BOROUGH'].value_counts()
collisions['BOROUGH'].value_counts().plot(kind='barh', color=['yellow', 'orange', 'g','r', 'purple'])
A quick look at the available data shows that Brooklyn and Queens have the most collisions, followed by Manhattan and then the Bronx.
We can also do the following data cleaning for a more in-depth analysis:
# Show all columns in the wide describe() output below.
pd.set_option('display.max_columns', None)
collisions.groupby('BOROUGH').describe()
With the current state of the data, we find that:
--> The most dangerous NYC borough / area for a bicycle rider is Brooklyn. It is followed by Manhattan and Queens.
• What would you change in the locations of bike stations to increase safety? Use visualization.
# Collisions where at least one cyclist was injured or killed.
# .copy() makes this an explicit copy, so the in-place edits below do not
# trigger pandas' SettingWithCopyWarning on a view of `collisions`.
cyclists = collisions[(collisions['NUMBER OF CYCLIST INJURED'] > 0) | (collisions['NUMBER OF CYCLIST KILLED'] > 0)].copy()
cyclists['NUMBER OF CYCLIST INJURED'].value_counts()
cyclists['NUMBER OF CYCLIST KILLED'].value_counts()
# Lower-case the column names for consistency with the stations frame.
cyclists.rename(str.lower, axis='columns', inplace=True)
cyclists['contributing factor vehicle 1'].value_counts().head(10)
cyclists['borough'].value_counts()
plt.plot(cyclists.longitude, cyclists.latitude, 'b.')
plt.title('Cyclist injured and killed collisions')
cyclists[['crash date_crash time','borough','zip code','latitude','longitude','number of cyclist injured','number of cyclist killed','contributing factor vehicle 1','vehicle type code 1','vehicle type code 2','collision_id']].head(2)
stations[['stationName','latitude','longitude','borough','totalDocks']].head(2)
# Combine cyclist collisions with bike-station records in a single frame.
# An outer merge on the exact (longitude, latitude) pair keeps every row
# from both sides; coordinates rarely coincide, so rows stay distinct.
crash_columns = ['crash date_crash time', 'borough', 'zip code', 'latitude',
                 'longitude', 'number of cyclist injured',
                 'number of cyclist killed', 'contributing factor vehicle 1',
                 'vehicle type code 1', 'vehicle type code 2', 'collision_id']
station_columns = ['stationName', 'latitude', 'longitude', 'borough', 'totalDocks']
cyclists = pd.merge(cyclists[crash_columns], stations[station_columns],
                    on=['longitude', 'latitude'], how='outer')
cyclists.head(2)
# Per-column overview of the merged frame.
cyclists_summary = pd.concat([cyclists.count(), cyclists.dtypes,
                              cyclists.nunique(), cyclists.isnull().sum()],
                             axis=1, sort=False)
cyclists_summary.columns = ['count', 'dtype', 'nunique', 'NaN']
cyclists_summary
len(cyclists)
# Label each row by its origin: rows carrying a collision_id came from the
# collisions data, the rest are bike stations.
cyclists.loc[cyclists['collision_id'].notnull(), 'collision_id'] = 'collision'
cyclists['collision_id'].fillna('bike_station', inplace=True)
cyclists.rename(columns={'collision_id': 'type'}, inplace=True)
# Try different zoom levels
fig = px.scatter(cyclists, x='longitude', y='latitude', hover_name='type',
                 hover_data=['stationName', 'totalDocks', 'crash date_crash time',
                             'number of cyclist injured', 'number of cyclist killed',
                             'contributing factor vehicle 1', 'vehicle type code 1'],
                 title='Bike accidents and Stations', color='type')
fig.show()
There are regions with lower bike-station densities and others not covered yet.
We should expand in both:
• Where can an accident occur and how close is this from the nearest bike station? Create a predictive model
# Binary target column: 1 when at least one cyclist was injured or killed.
collisions['CYCLIST'] = 0
collisions.head()
cyclist_involved = (collisions['NUMBER OF CYCLIST INJURED'] > 0) | (collisions['NUMBER OF CYCLIST KILLED'] > 0)
collisions.loc[cyclist_involved, 'CYCLIST'] = 1
# Class balance: cyclist collisions are a small minority.
collisions['CYCLIST'].value_counts().plot.bar()
Problem identification: class imbalance (to be addressed via oversampling or undersampling) + time series + geolocation
# Time coverage for cyclist vs non-cyclist collisions.
cyclist_rows = collisions['CYCLIST'] == 1
collisions[cyclist_rows]['CRASH DATE_CRASH TIME'].describe()
collisions[~cyclist_rows]['CRASH DATE_CRASH TIME'].describe()
collisions[cyclist_rows]['CRASH DATE_CRASH TIME'].value_counts().head(5)
# Lower-case all column names, matching the stations frame convention.
collisions.rename(str.lower, axis='columns', inplace=True)
# Coordinate bounds used to build the spatial grid below.
lat_min, lat_max = collisions['latitude'].min(), collisions['latitude'].max()
lon_min, lon_max = collisions['longitude'].min(), collisions['longitude'].max()
#setting up 2D array. divide max and min values of coordinates into regular intervals
# 65 edges -> 64 bins per axis.
yedges = np.linspace(lat_min, lat_max, 65)
xedges = np.linspace(lon_min, lon_max, 65)
#generate the histogram using bins defined above
hist, yedges, xedges = np.histogram2d(collisions['latitude'], collisions['longitude'], bins=(yedges, xedges))
# origin='lower' puts south at the bottom of the image. The original
# 'low' is not a valid value and is rejected by modern matplotlib.
plt.imshow(hist, interpolation='nearest', cmap='hot', origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(fraction=0.046, pad=0.04)
collisions.head(1)
# Index by crash timestamp so date-range slicing below is straightforward.
collisions.set_index('crash date_crash time', inplace=True)
# Build one 64x64 accident-count grid per day. The daily subset is computed
# once per iteration (the original filtered the frame twice per day).
accidents_list = []
for day in pd.date_range(start='2012-07-01', end='2021-02-08', freq='D'):
    daily = collisions.loc[(collisions.index >= day) & (collisions.index < day + timedelta(days=1))]
    hist, yedges, xedges = np.histogram2d(daily.latitude, daily.longitude, bins=(yedges, xedges))
    accidents_list.append(hist)
accidents_array = np.array(accidents_list)
print('Accident array shape:', accidents_array.shape)
# Trailing channel axis for the ConvLSTM model; -1 replaces the
# hard-coded sample count (3145 days).
accidents_array_reshape = accidents_array.reshape(-1, 64, 64, 1)
# Plotting of one day of accidents (the last day; [-1] replaces the
# hard-coded index 3144).
plt.imshow(accidents_array[-1], interpolation='nearest', cmap='hot', origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(fraction=0.046, pad=0.04)
# 7-day sliding mean of the daily grids. The series shrinks by window-1
# (3145 days -> 3139 windows for a 7-day window).
window = 7
n_windows = len(accidents_array) - window + 1
array_of_frequency = np.array(
    [accidents_array[i:i + window].mean(axis=0) for i in range(n_windows)])
# Trailing channel axis; -1 avoids the hard-coded count 3139.
array_of_frequency_reshape = array_of_frequency.reshape(-1, 64, 64, 1)
# Plotting of the frequency
plt.imshow(array_of_frequency[12], interpolation='nearest', cmap='hot', origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(fraction=0.046, pad=0.04)
# Binarize: do not discriminate between a single accident and many accidents.
array_of_frequency_reshape[array_of_frequency_reshape > 0] = 1
len(array_of_frequency_reshape[array_of_frequency_reshape == 1])
array_of_frequency_reshape.shape
# Splitting the data to train, test and validation datasets 70|20|10.
# Sizes are derived from the array length instead of hard-coded counts
# (2197 / 628 / 314 for the 3139 samples).
n_samples = len(array_of_frequency_reshape)
n_train = round(n_samples * 0.7)
n_validate = round(n_samples * 0.1)
train_array = array_of_frequency_reshape[:n_train]
test_array = array_of_frequency_reshape[n_train:-n_validate]
validate_array = array_of_frequency_reshape[-n_validate:]
len(train_array), len(test_array), len(validate_array)
Training the model on the preceding 7 days to predict the 8th (using the Keras TimeseriesGenerator)
# Use the preceding 7 days as input to predict the 8th.
n_input = 7
train_generator = TimeseriesGenerator(train_array, train_array, length=n_input, batch_size=1)
test_generator = TimeseriesGenerator(test_array, test_array, length=n_input, batch_size=1)
validate_generator = TimeseriesGenerator(validate_array, validate_array, length=n_input, batch_size=1)
# number of samples (uniform print style; the original mixed % and ,)
print('Length of train generator:', len(train_generator))
print('Length of test generator:', len(test_generator))
print('Length of validate generator:', len(validate_generator))
# Inspect one (input window, target) pair from the validation generator.
X, y = validate_generator[300]
print('check shape of arrays:', X.shape, y.shape)
# origin='lower' replaces the invalid 'low' spelling.
plt.imshow(y.reshape(64, 64), interpolation='nearest', cmap='hot', origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(fraction=0.046, pad=0.04)
# The model: stacked ConvLSTM2D layers learn spatio-temporal patterns in
# the daily 64x64 accident grids; the final sigmoid Conv2D outputs a
# per-cell accident probability map.
seq = Sequential()
seq.add(ConvLSTM2D(filters=64, kernel_size=(5, 5), input_shape=(None, 64, 64, 1),
                   padding='same', activation='relu', return_sequences=True))
seq.add(BatchNormalization())
seq.add(ConvLSTM2D(filters=64, kernel_size=(5, 5), padding='same',
                   activation='relu', return_sequences=True))
seq.add(BatchNormalization())
seq.add(ConvLSTM2D(filters=64, kernel_size=(5, 5), padding='same',
                   activation='relu', return_sequences=True))
seq.add(BatchNormalization())
# return_sequences=False: the last ConvLSTM2D emits a single frame.
seq.add(ConvLSTM2D(filters=64, kernel_size=(5, 5), padding='same',
                   activation='relu', return_sequences=False))
seq.add(BatchNormalization())
seq.add(Conv2D(filters=1, kernel_size=(5, 5), activation='sigmoid',
               padding='same', data_format='channels_last'))
# Compile. 'learning_rate' replaces the deprecated 'lr' keyword.
# NOTE(review): 1e-7 is an unusually small learning rate — confirm intended.
seq.compile(loss='binary_crossentropy',
            optimizer=keras.optimizers.Adam(learning_rate=0.0000001),
            metrics=['binary_accuracy'])
seq.summary()
# Model.fit accepts generators directly; fit_generator is deprecated in TF2
# and behaves identically for this call.
history_2_bin = seq.fit(train_generator, epochs=10,
                        validation_data=test_generator,
                        callbacks=[EarlyStopping(patience=2)])
# Persist the trained model and its weights to Google Drive.
seq.save('/content/drive/My Drive/Colab Notebooks/final_model.h5')
seq.save_weights('/content/drive/My Drive/final_weights.h5')
# Training curves: loss on train vs test.
history = pd.DataFrame.from_dict(history_2_bin.history)
history.head()
plt.plot(history['loss'], lw=3)
plt.plot(history['val_loss'], lw=3)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper right')
history.to_csv('/content/drive/My Drive/final_history.csv')
# Side-by-side comparison of a validation target and the model's prediction.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.title('Ground Truth')
# origin='lower' replaces the invalid 'low' spelling.
plt.imshow(y.reshape(64, 64), interpolation='nearest', cmap='Greys',
           origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(fraction=0.046, pad=0.04)
plt.subplot(1, 2, 2)
plt.title('Prediction')
prediction = seq.predict(X)
plt.imshow(prediction.reshape(64, 64), interpolation='nearest', cmap='Greys',
           origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(fraction=0.046, pad=0.04)
The current approach to solving the problem is relevant but requires further improvements. We can also work on the data selected and on the model type or architecture. We will keep this approach and others to further research.
A classification problem requires often other evaluation metrics than the accuracy. Based on the problem faced, the customer may want to focus on lower false negatives. In this case we refer to the confusion matrix, precision, recall, f1 score and roc_auc_score
# Most frequent streets, contributing factors and vehicle types.
collisions['on street name'].value_counts(ascending=False).head(10)
collisions['contributing factor vehicle 1'].value_counts().head(20)
collisions['vehicle type code 1'].value_counts().head(20)
collisions['vehicle type code 2'].value_counts().head(20)
# Vehicle type "BICYCLE " mentions in the collisions dataframe
len(collisions[(collisions['vehicle type code 1'] =='BICYCLE') | (collisions['vehicle type code 2'] =='BICYCLE')])
# Vehicle type "BICYCLE " mentions in the cyclists dataframe
len(cyclists[(cyclists['vehicle type code 1'] =='BICYCLE') | (cyclists['vehicle type code 2'] =='BICYCLE')])
# Derive calendar features from the datetime index for time-based grouping.
collisions['year'] = collisions.index.year
collisions['month'] = collisions.index.month
collisions['day'] = collisions.index.day_name()
collisions['hour'] = collisions.index.hour
# Constant counter column: counting it after a groupby yields incident totals.
collisions['incidents'] = 1
# Incidents per (year, borough): count() on the constant 'incidents'
# column yields the group sizes; iloc[:, -1:] keeps just that column.
collisions_year = collisions.groupby(['year','borough']).count().iloc[:,-1:]
collisions_year.head()
collisions_year = collisions_year.reset_index()
fig = px.line(collisions_year, x="year", y="incidents", color='borough')
fig.show()
Manhattan has been radically improving its figures since 2015. The general findings we obtained before are confirmed on a yearly basis.
# Spread of yearly incident counts across boroughs, per year.
sns.boxplot(data=collisions_year, x='year', y='incidents')
From 2019, the data has been showing a decrease in incident records. In 2020, we notice a radical drop to 2012 values (we suspect this is related to COVID restrictions).
# Incidents per (year, month, borough).
collisions_year_month = collisions.groupby(['year','month','borough']).count().iloc[:,-1:]
collisions_year_month.head()
collisions_year_month = collisions_year_month.reset_index()
collisions_year_month[collisions_year_month['year']== 2019].head()
# Seasonality: distribution of monthly incident counts.
sns.boxplot(data=collisions_year_month, x='month', y='incidents')
The monthly records are fairly steady, with slightly lower values in the winter season and an increase from May.
# Monthly breakdown for 2020 highlights the spring drop in records.
fig = px.line(collisions_year_month[collisions_year_month['year']== 2020], x="month", y="incidents", title='2020 NYC collisions records' ,color='borough')
fig.show()
The sudden drop in records in April 2020 supports our supposition about the effects of COVID restrictions (a quick check online supports this too, but it has to be confirmed).
# One monthly-breakdown chart per remaining year of interest. The four
# near-identical px.line blocks are collapsed into a loop (DRY); titles
# and figures are identical to the originals.
for year in (2019, 2016, 2015, 2014):
    fig = px.line(collisions_year_month[collisions_year_month['year'] == year],
                  x="month", y="incidents",
                  title=f'{year} NYC collisions records', color='borough')
    fig.show()
# Incidents per (year, month, weekday, borough).
collisions_year_month_day = collisions.groupby(['year','month','day','borough']).count().iloc[:,-1:]
collisions_year_month_day = collisions_year_month_day.reset_index()
collisions_year_month_day.head()
# Weekday effect: distribution of daily incident counts.
sns.boxplot(data=collisions_year_month_day, x='day', y='incidents')
The weekend records are lower compared to the rest of the working week.
# Daily incident distribution per borough.
sns.boxplot(data=collisions_year_month_day, x='borough', y='incidents')
# Incidents per (year, month, weekday, hour, borough).
collisions_year_month_hour = collisions.groupby(['year','month','day','hour','borough']).count().iloc[:,-1:]
collisions_year_month_hour.head(20)
collisions_year_month_hour = collisions_year_month_hour.reset_index()
# Hour-of-day effect on collision counts.
sns.boxplot(data=collisions_year_month_hour, x='hour', y='incidents')
There are higher risks of collisions during the standard active hours of the day, from 8am to 10pm.
# Hourly incident distribution per borough.
sns.boxplot(data=collisions_year_month_hour, x='borough', y='incidents')
Brooklyn keeps the highest levels of risks even by hourly data
Note: Similar analysis can be done to only the cyclists or even to the trio [pedestrian, cyclists, bikes]
What other data sources would be interesting to correlate?